{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def infoparse(x):\n",
    "    \"\"\"Parse a GFF3 attribute string (column 9) into a dict.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    x : str\n",
    "        Semicolon-separated 'key=value' fields, e.g. 'ID=g1;Parent=p1'.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Maps each key to its value; a repeated key keeps the last value.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    IndexError\n",
    "        If a non-blank field contains no '='.\n",
    "    \"\"\"\n",
    "    attrs = {}\n",
    "    for field in x.split(';'):\n",
    "        # A trailing ';' (or an empty string) yields a blank field; skip it\n",
    "        # instead of failing on the missing '='.\n",
    "        if field.strip() == '':\n",
    "            continue\n",
    "        # Split on the first '=' only so values that contain '=' stay intact.\n",
    "        parts = field.split('=', 1)\n",
    "        attrs[parts[0]] = parts[1]\n",
    "    return attrs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def rev_comp(strSeq):\n",
    "    \"\"\"Return the reverse complement of a nucleotide string.\n",
    "\n",
    "    Each character is upper-cased and looked up in the A/T/G/C/N table;\n",
    "    anything not in the table becomes 'N'. The result is always upper case.\n",
    "    \"\"\"\n",
    "    complement = {'A':'T','T':'A','G':'C','C':'G','N':'N'}\n",
    "    forward = ''.join(complement.get(base.upper(), 'N') for base in strSeq)\n",
    "    return forward[::-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "file_fa = '../../ref/Gmax_275_v2.0.fa'\n",
    "\n",
    "# Read the whole FASTA file and cut it into records at each '>'.\n",
    "# The context manager closes the handle as soon as the text is read\n",
    "# (a bare open(...).read() leaves that to the garbage collector).\n",
    "with open(file_fa) as fa_handle:\n",
    "    bulk = fa_handle.read().split('>')\n",
    "\n",
    "# Map each header line (the text after '>') to its sequence with the\n",
    "# line breaks removed.\n",
    "dicHD2seq = {}\n",
    "for each_bulk in bulk:\n",
    "    # Text before the first '>' and empty records carry no sequence.\n",
    "    if each_bulk.strip() == '':\n",
    "        continue\n",
    "    record_lines = each_bulk.split('\\n')\n",
    "    genename = record_lines[0]\n",
    "    seq      = ''.join(record_lines[1:])\n",
    "    dicHD2seq[genename] = seq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  import sys\n",
      "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:18: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
     ]
    }
   ],
   "source": [
    "file_gff = '../../finalout/my_csv.csv.addgene.gff3.sort.gff3.merge.all.gff3.sort.gff3'\n",
    "\n",
    "# Read without a header row; columns are addressed by position 0-8.\n",
    "df_gff = pd.read_csv(file_gff,sep='\\t',header=None,comment='#')\n",
    "\n",
    "# Gene rows. .copy() gives an independent frame, so adding a column below\n",
    "# no longer writes to a slice of df_gff (the source of SettingWithCopyWarning).\n",
    "mask        = (df_gff[2] == 'gene')\n",
    "df_gff_gene = df_gff[mask].copy()\n",
    "df_gff_gene['genename'] = df_gff_gene[8].apply(lambda x:infoparse(x)['ID'])\n",
    "\n",
    "# mRNA rows: map each transcript ID to its Parent attribute.\n",
    "mask        = (df_gff[2] == 'mRNA')\n",
    "df_gff_mRNA = df_gff[mask]\n",
    "mRNA_ID     = df_gff_mRNA[8].apply(lambda x : infoparse(x)['ID'])\n",
    "mRNA_Parent = df_gff_mRNA[8].apply(lambda x : infoparse(x)['Parent'])\n",
    "dict2g = dict(zip(mRNA_ID,mRNA_Parent))\n",
    "\n",
    "# CDS rows, again copied before a column is added.\n",
    "mask = (df_gff[2] == 'CDS')\n",
    "df_gff_cds = df_gff[mask].copy()\n",
    "\n",
    "# The 'genename' column of the CDS table holds the CDS row's Parent attribute.\n",
    "df_gff_cds['genename'] = df_gff_cds[8].apply(lambda x : infoparse(x)['Parent'])\n",
    "\n",
    "df_gff_cds_index = df_gff_cds.set_index('genename')\n",
    "\n",
    "# Order rows by start coordinate (column 3) so each parent's CDS segments\n",
    "# are encountered in ascending position.\n",
    "df_gff_cds_index = df_gff_cds_index.sort_values(3, ascending=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>genename</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Glyma.U007300.1.Wm82.a2.v1</th>\n",
       "      <td>scaffold_1793</td>\n",
       "      <td>phytozomev10</td>\n",
       "      <td>CDS</td>\n",
       "      <td>8</td>\n",
       "      <td>15</td>\n",
       "      <td>.</td>\n",
       "      <td>+</td>\n",
       "      <td>0</td>\n",
       "      <td>ID=Glyma.U007300.1.Wm82.a2.v1.CDS.1;Parent=Gly...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Gene.156639::STRG.66472.2::g.156639::m.156639</th>\n",
       "      <td>scaffold_486</td>\n",
       "      <td>ST/TD/KYJ</td>\n",
       "      <td>CDS</td>\n",
       "      <td>15</td>\n",
       "      <td>356</td>\n",
       "      <td>.</td>\n",
       "      <td>-</td>\n",
       "      <td>0</td>\n",
       "      <td>ID=cds.Gene.156639::STRG.66472.2::g.156639::m....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Gene.156641::STRG.66472.1::g.156641::m.156641</th>\n",
       "      <td>scaffold_486</td>\n",
       "      <td>ST/TD/KYJ</td>\n",
       "      <td>CDS</td>\n",
       "      <td>15</td>\n",
       "      <td>356</td>\n",
       "      <td>.</td>\n",
       "      <td>-</td>\n",
       "      <td>0</td>\n",
       "      <td>ID=cds.Gene.156641::STRG.66472.1::g.156641::m....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Glyma.U039600.Wm82.a2.v1.st.1</th>\n",
       "      <td>scaffold_484</td>\n",
       "      <td>ST/TD/KYJ</td>\n",
       "      <td>CDS</td>\n",
       "      <td>20</td>\n",
       "      <td>72</td>\n",
       "      <td>.</td>\n",
       "      <td>-</td>\n",
       "      <td>2</td>\n",
       "      <td>ID=cds.Glyma.U039600.Wm82.a2.v1.st.1.4;Parent=...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Glyma.U008600.1.Wm82.a2.v1</th>\n",
       "      <td>scaffold_1990</td>\n",
       "      <td>phytozomev10</td>\n",
       "      <td>CDS</td>\n",
       "      <td>33</td>\n",
       "      <td>160</td>\n",
       "      <td>.</td>\n",
       "      <td>-</td>\n",
       "      <td>2</td>\n",
       "      <td>ID=Glyma.U008600.1.Wm82.a2.v1.CDS.4;Parent=Gly...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                           0             1  \\\n",
       "genename                                                                     \n",
       "Glyma.U007300.1.Wm82.a2.v1                     scaffold_1793  phytozomev10   \n",
       "Gene.156639::STRG.66472.2::g.156639::m.156639   scaffold_486     ST/TD/KYJ   \n",
       "Gene.156641::STRG.66472.1::g.156641::m.156641   scaffold_486     ST/TD/KYJ   \n",
       "Glyma.U039600.Wm82.a2.v1.st.1                   scaffold_484     ST/TD/KYJ   \n",
       "Glyma.U008600.1.Wm82.a2.v1                     scaffold_1990  phytozomev10   \n",
       "\n",
       "                                                 2   3    4  5  6  7  \\\n",
       "genename                                                               \n",
       "Glyma.U007300.1.Wm82.a2.v1                     CDS   8   15  .  +  0   \n",
       "Gene.156639::STRG.66472.2::g.156639::m.156639  CDS  15  356  .  -  0   \n",
       "Gene.156641::STRG.66472.1::g.156641::m.156641  CDS  15  356  .  -  0   \n",
       "Glyma.U039600.Wm82.a2.v1.st.1                  CDS  20   72  .  -  2   \n",
       "Glyma.U008600.1.Wm82.a2.v1                     CDS  33  160  .  -  2   \n",
       "\n",
       "                                                                                               8  \n",
       "genename                                                                                          \n",
       "Glyma.U007300.1.Wm82.a2.v1                     ID=Glyma.U007300.1.Wm82.a2.v1.CDS.1;Parent=Gly...  \n",
       "Gene.156639::STRG.66472.2::g.156639::m.156639  ID=cds.Gene.156639::STRG.66472.2::g.156639::m....  \n",
       "Gene.156641::STRG.66472.1::g.156641::m.156641  ID=cds.Gene.156641::STRG.66472.1::g.156641::m....  \n",
       "Glyma.U039600.Wm82.a2.v1.st.1                  ID=cds.Glyma.U039600.Wm82.a2.v1.st.1.4;Parent=...  \n",
       "Glyma.U008600.1.Wm82.a2.v1                     ID=Glyma.U008600.1.Wm82.a2.v1.CDS.4;Parent=Gly...  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the start-sorted CDS table indexed by 'genename'.\n",
    "df_gff_cds_index.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Per-transcript accumulators; each list receives one entry per transcript.\n",
    "dic = {'transcriptname' : [],\n",
    "       'strand' : [],\n",
    "       'CDSloc' : [],\n",
    "       'CDSseq' : []}\n",
    "\n",
    "# groupby hands over each transcript's CDS rows as a DataFrame (even when there\n",
    "# is only one row) and keeps the row order of df_gff_cds_index within a group.\n",
    "# This replaces the per-name .loc lookup and its DataFrame-vs-Series type check,\n",
    "# and makes the transcript order deterministic (first appearance in the table).\n",
    "for genename, edf in df_gff_cds_index.groupby(level=0, sort=False):\n",
    "    chromosome = edf[0].values[0]\n",
    "    strand     = edf[6].values[0]\n",
    "\n",
    "    # Materialise the (start, end) pairs: in Python 3 a bare zip() is a one-shot\n",
    "    # iterator, so storing it after looping over it kept an exhausted object.\n",
    "    CDS_list = list(zip(edf[3], edf[4]))\n",
    "\n",
    "    # Columns 3/4 are used as 1-based inclusive coordinates, hence [l-1:r].\n",
    "    cdsseq = ''.join(dicHD2seq[chromosome][l-1:r] for l,r in CDS_list)\n",
    "\n",
    "    # NOTE(review): every strand other than '+' (including '.') is\n",
    "    # reverse-complemented, as before - confirm that is intended for '.'.\n",
    "    if strand != '+':\n",
    "        cdsseq = rev_comp(cdsseq)\n",
    "\n",
    "    # Append only after the sequence was built so the four lists stay aligned\n",
    "    # even if a chromosome lookup fails part-way through.\n",
    "    dic['transcriptname'].append(genename)\n",
    "    dic['strand'].append(strand)\n",
    "    dic['CDSloc'].append(CDS_list)\n",
    "    dic['CDSseq'].append(cdsseq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Turn the per-transcript lists collected in dic into a table, one row per transcript.\n",
    "df_cds = pd.DataFrame(dic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Look up each transcript's parent in dict2g; a transcript missing from the\n",
    "# mapping raises KeyError rather than producing a missing value.\n",
    "df_cds['gene'] = df_cds['transcriptname'].apply(dict2g.__getitem__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CDSloc</th>\n",
       "      <th>CDSseq</th>\n",
       "      <th>strand</th>\n",
       "      <th>transcriptname</th>\n",
       "      <th>gene</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>&lt;zip object at 0x7fa3e6b7fc48&gt;</td>\n",
       "      <td>ATGGCACTTGCTGAAACAAACATGAAAAGCTCTCTGCAAAATCGCA...</td>\n",
       "      <td>-</td>\n",
       "      <td>Glyma.06G011500.1.Wm82.a2.v1</td>\n",
       "      <td>Glyma.06G011500.Wm82.a2.v1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;zip object at 0x7fa3e6b7ff08&gt;</td>\n",
       "      <td>ATGTCTGCGTCCGAAGCGGAGAAGAAGGTCGAGGAGGAAGAGAAGA...</td>\n",
       "      <td>-</td>\n",
       "      <td>Glyma.06G025900.1.Wm82.a2.v1</td>\n",
       "      <td>Glyma.06G025900.Wm82.a2.v1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;zip object at 0x7fa3e6b81248&gt;</td>\n",
       "      <td>ATGGCAATTAGTGTTATGAAGGAGGCCAACATGAAAGGGCGTATGG...</td>\n",
       "      <td>-</td>\n",
       "      <td>Glyma.15G228000.1.Wm82.a2.v1</td>\n",
       "      <td>Glyma.15G228000.Wm82.a2.v1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;zip object at 0x7fa3e6b7fbc8&gt;</td>\n",
       "      <td>ATGTCTCCTATTCAACTTGGCAGAGGGAAGAATTGGCTAAGATACT...</td>\n",
       "      <td>+</td>\n",
       "      <td>Glyma.18G247600.1.Wm82.a2.v1</td>\n",
       "      <td>Glyma.18G247600.Wm82.a2.v1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;zip object at 0x7fa3e6b81348&gt;</td>\n",
       "      <td>ATGAGTAGAAGAAGCATGCCTGCAAAATTGTTGCTCACAACTTTGT...</td>\n",
       "      <td>-</td>\n",
       "      <td>Glyma.14G085400.1.Wm82.a2.v1</td>\n",
       "      <td>Glyma.14G085400.Wm82.a2.v1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           CDSloc  \\\n",
       "0  <zip object at 0x7fa3e6b7fc48>   \n",
       "1  <zip object at 0x7fa3e6b7ff08>   \n",
       "2  <zip object at 0x7fa3e6b81248>   \n",
       "3  <zip object at 0x7fa3e6b7fbc8>   \n",
       "4  <zip object at 0x7fa3e6b81348>   \n",
       "\n",
       "                                              CDSseq strand  \\\n",
       "0  ATGGCACTTGCTGAAACAAACATGAAAAGCTCTCTGCAAAATCGCA...      -   \n",
       "1  ATGTCTGCGTCCGAAGCGGAGAAGAAGGTCGAGGAGGAAGAGAAGA...      -   \n",
       "2  ATGGCAATTAGTGTTATGAAGGAGGCCAACATGAAAGGGCGTATGG...      -   \n",
       "3  ATGTCTCCTATTCAACTTGGCAGAGGGAAGAATTGGCTAAGATACT...      +   \n",
       "4  ATGAGTAGAAGAAGCATGCCTGCAAAATTGTTGCTCACAACTTTGT...      -   \n",
       "\n",
       "                 transcriptname                        gene  \n",
       "0  Glyma.06G011500.1.Wm82.a2.v1  Glyma.06G011500.Wm82.a2.v1  \n",
       "1  Glyma.06G025900.1.Wm82.a2.v1  Glyma.06G025900.Wm82.a2.v1  \n",
       "2  Glyma.15G228000.1.Wm82.a2.v1  Glyma.15G228000.Wm82.a2.v1  \n",
       "3  Glyma.18G247600.1.Wm82.a2.v1  Glyma.18G247600.Wm82.a2.v1  \n",
       "4  Glyma.14G085400.1.Wm82.a2.v1  Glyma.14G085400.Wm82.a2.v1  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the per-transcript CDS table.\n",
    "df_cds.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Index the table by (gene, transcriptname); df_cds itself is left unchanged.\n",
    "df_cds_index = df_cds.set_index(['gene','transcriptname'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Save the indexed table as a pickle next to the GFF file ('<gff path>.cdsseq.pk').\n",
    "df_cds_index.to_pickle(file_gff+'.cdsseq.pk')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CDSloc</th>\n",
       "      <th>CDSseq</th>\n",
       "      <th>strand</th>\n",
       "      <th>transcriptname</th>\n",
       "      <th>gene</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>&lt;zip object at 0x7fa3e6b7fc48&gt;</td>\n",
       "      <td>ATGGCACTTGCTGAAACAAACATGAAAAGCTCTCTGCAAAATCGCA...</td>\n",
       "      <td>-</td>\n",
       "      <td>Glyma.06G011500.1.Wm82.a2.v1</td>\n",
       "      <td>Glyma.06G011500.Wm82.a2.v1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;zip object at 0x7fa3e6b7ff08&gt;</td>\n",
       "      <td>ATGTCTGCGTCCGAAGCGGAGAAGAAGGTCGAGGAGGAAGAGAAGA...</td>\n",
       "      <td>-</td>\n",
       "      <td>Glyma.06G025900.1.Wm82.a2.v1</td>\n",
       "      <td>Glyma.06G025900.Wm82.a2.v1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;zip object at 0x7fa3e6b81248&gt;</td>\n",
       "      <td>ATGGCAATTAGTGTTATGAAGGAGGCCAACATGAAAGGGCGTATGG...</td>\n",
       "      <td>-</td>\n",
       "      <td>Glyma.15G228000.1.Wm82.a2.v1</td>\n",
       "      <td>Glyma.15G228000.Wm82.a2.v1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;zip object at 0x7fa3e6b7fbc8&gt;</td>\n",
       "      <td>ATGTCTCCTATTCAACTTGGCAGAGGGAAGAATTGGCTAAGATACT...</td>\n",
       "      <td>+</td>\n",
       "      <td>Glyma.18G247600.1.Wm82.a2.v1</td>\n",
       "      <td>Glyma.18G247600.Wm82.a2.v1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;zip object at 0x7fa3e6b81348&gt;</td>\n",
       "      <td>ATGAGTAGAAGAAGCATGCCTGCAAAATTGTTGCTCACAACTTTGT...</td>\n",
       "      <td>-</td>\n",
       "      <td>Glyma.14G085400.1.Wm82.a2.v1</td>\n",
       "      <td>Glyma.14G085400.Wm82.a2.v1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           CDSloc  \\\n",
       "0  <zip object at 0x7fa3e6b7fc48>   \n",
       "1  <zip object at 0x7fa3e6b7ff08>   \n",
       "2  <zip object at 0x7fa3e6b81248>   \n",
       "3  <zip object at 0x7fa3e6b7fbc8>   \n",
       "4  <zip object at 0x7fa3e6b81348>   \n",
       "\n",
       "                                              CDSseq strand  \\\n",
       "0  ATGGCACTTGCTGAAACAAACATGAAAAGCTCTCTGCAAAATCGCA...      -   \n",
       "1  ATGTCTGCGTCCGAAGCGGAGAAGAAGGTCGAGGAGGAAGAGAAGA...      -   \n",
       "2  ATGGCAATTAGTGTTATGAAGGAGGCCAACATGAAAGGGCGTATGG...      -   \n",
       "3  ATGTCTCCTATTCAACTTGGCAGAGGGAAGAATTGGCTAAGATACT...      +   \n",
       "4  ATGAGTAGAAGAAGCATGCCTGCAAAATTGTTGCTCACAACTTTGT...      -   \n",
       "\n",
       "                 transcriptname                        gene  \n",
       "0  Glyma.06G011500.1.Wm82.a2.v1  Glyma.06G011500.Wm82.a2.v1  \n",
       "1  Glyma.06G025900.1.Wm82.a2.v1  Glyma.06G025900.Wm82.a2.v1  \n",
       "2  Glyma.15G228000.1.Wm82.a2.v1  Glyma.15G228000.Wm82.a2.v1  \n",
       "3  Glyma.18G247600.1.Wm82.a2.v1  Glyma.18G247600.Wm82.a2.v1  \n",
       "4  Glyma.14G085400.1.Wm82.a2.v1  Glyma.14G085400.Wm82.a2.v1  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of df_cds (flat table, not the indexed copy).\n",
    "df_cds.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep transcripts whose name contains the literal substring 'st'.\n",
    "# NOTE(review): plain substring match - any name containing 'st' qualifies.\n",
    "has_st = df_cds['transcriptname'].str.contains('st', regex=False)\n",
    "df_cds_st = df_cds[has_st]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep transcripts whose name does not contain the literal substring 'Glyma'.\n",
    "lacks_glyma = ~df_cds['transcriptname'].str.contains('Glyma', regex=False)\n",
    "df_cds_new = df_cds[lacks_glyma]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CDSloc</th>\n",
       "      <th>CDSseq</th>\n",
       "      <th>strand</th>\n",
       "      <th>transcriptname</th>\n",
       "      <th>gene</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>&lt;zip object at 0x7fa3e6b84348&gt;</td>\n",
       "      <td>AAACTTCATAGTATTGGAAAGAAAGTTGCATACATATTGAAGATAT...</td>\n",
       "      <td>.</td>\n",
       "      <td>Gene.21026::STRG.7013.1::g.21026::m.21026</td>\n",
       "      <td>Gene.21026::STRG</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76</th>\n",
       "      <td>&lt;zip object at 0x7fa3e6b960c8&gt;</td>\n",
       "      <td>AAAAATTCACCTCAAAAACAACCAAATAGCAGTAGCAACAATTCAA...</td>\n",
       "      <td>.</td>\n",
       "      <td>Gene.86153::STRG.34756.1::g.86153::m.86153</td>\n",
       "      <td>Gene.86153::STRG</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>260</th>\n",
       "      <td>&lt;zip object at 0x7fa3e6b98a88&gt;</td>\n",
       "      <td>CAGGAACAACCAGAATTCAACATCCAACCCATACAAATAATTCCTG...</td>\n",
       "      <td>.</td>\n",
       "      <td>Gene.72616::STRG.27688.1::g.72616::m.72616</td>\n",
       "      <td>Gene.72616::STRG</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>263</th>\n",
       "      <td>&lt;zip object at 0x7fa3e6b25b08&gt;</td>\n",
       "      <td>ATGGTGTTGATCAGAGAAGAAGAGATAGTGGAGGAGATGAGTGAAG...</td>\n",
       "      <td>+</td>\n",
       "      <td>Gene.138207::STRG.56225.2::g.138207::m.138207</td>\n",
       "      <td>Gene.138207::STRG</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>273</th>\n",
       "      <td>&lt;zip object at 0x7fa3e6b43488&gt;</td>\n",
       "      <td>AAGATGGAAAAATCTGAAAAGAAAAATCATAGTGAGAAGAAACTAA...</td>\n",
       "      <td>+</td>\n",
       "      <td>Gene.149998::STRG.63653.1::g.149998::m.149998</td>\n",
       "      <td>Gene.149998::STRG</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             CDSloc  \\\n",
       "39   <zip object at 0x7fa3e6b84348>   \n",
       "76   <zip object at 0x7fa3e6b960c8>   \n",
       "260  <zip object at 0x7fa3e6b98a88>   \n",
       "263  <zip object at 0x7fa3e6b25b08>   \n",
       "273  <zip object at 0x7fa3e6b43488>   \n",
       "\n",
       "                                                CDSseq strand  \\\n",
       "39   AAACTTCATAGTATTGGAAAGAAAGTTGCATACATATTGAAGATAT...      .   \n",
       "76   AAAAATTCACCTCAAAAACAACCAAATAGCAGTAGCAACAATTCAA...      .   \n",
       "260  CAGGAACAACCAGAATTCAACATCCAACCCATACAAATAATTCCTG...      .   \n",
       "263  ATGGTGTTGATCAGAGAAGAAGAGATAGTGGAGGAGATGAGTGAAG...      +   \n",
       "273  AAGATGGAAAAATCTGAAAAGAAAAATCATAGTGAGAAGAAACTAA...      +   \n",
       "\n",
       "                                    transcriptname               gene  \n",
       "39       Gene.21026::STRG.7013.1::g.21026::m.21026   Gene.21026::STRG  \n",
       "76      Gene.86153::STRG.34756.1::g.86153::m.86153   Gene.86153::STRG  \n",
       "260     Gene.72616::STRG.27688.1::g.72616::m.72616   Gene.72616::STRG  \n",
       "263  Gene.138207::STRG.56225.2::g.138207::m.138207  Gene.138207::STRG  \n",
       "273  Gene.149998::STRG.63653.1::g.149998::m.149998  Gene.149998::STRG  "
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the subset whose names lack 'Glyma'.\n",
    "df_cds_new.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Write every transcript's CDS as a two-line FASTA record (header, sequence).\n",
    "with open(file_gff+'.cds.fa','w') as f:\n",
    "    # Walk the two columns in parallel rather than building a whole row\n",
    "    # Series twice per record via df_cds.loc[ix][...].\n",
    "    for name, cdsseq in zip(df_cds['transcriptname'], df_cds['CDSseq']):\n",
    "        print('>'+name,file=f)\n",
    "        print(cdsseq,file=f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the 'st' subset as two-line FASTA records (header, sequence).\n",
    "with open(file_gff+'.cds_st.fa','w') as f:\n",
    "    # Walk the two columns in parallel rather than building a whole row\n",
    "    # Series twice per record via df_cds_st.loc[ix][...].\n",
    "    for name, cdsseq in zip(df_cds_st['transcriptname'], df_cds_st['CDSseq']):\n",
    "        print('>'+name,file=f)\n",
    "        print(cdsseq,file=f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the non-'Glyma' subset as two-line FASTA records (header, sequence).\n",
    "with open(file_gff+'.cds_new.fa','w') as f:\n",
    "    # Walk the two columns in parallel rather than building a whole row\n",
    "    # Series twice per record via df_cds_new.loc[ix][...].\n",
    "    for name, cdsseq in zip(df_cds_new['transcriptname'], df_cds_new['CDSseq']):\n",
    "        print('>'+name,file=f)\n",
    "        print(cdsseq,file=f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
